Data loading

Load required packages

library(Matrix)
library(tidyverse)
library(scales)
library(gridExtra)
library(extrafont)
library(ggrepel)

Read in expression matrix

data_directory <- "../../data/drop-seq/"

expr_readcount_use <-
    sparseMatrix(i = readRDS(file.path(data_directory, "expr_readcount_raw_csc_indices.rds")),
                 p = readRDS(file.path(data_directory, "expr_readcount_raw_csc_indptr.rds")),
                 x = readRDS(file.path(data_directory, "expr_readcount_raw_csc_values.rds")),
                 dims = readRDS(file.path(data_directory, "expr_readcount_raw_csc_shape.rds")),
                 dimnames = readRDS(file.path(data_directory, "expr_readcount_raw_csc_dimnames.rds")),
                 index1 = FALSE)
dim(expr_readcount_use)
## [1] 27999 27416

Load clustering result

tsne_out_coords <- readRDS(file = file.path(data_directory,
                                            "tsne_out_coords.rds"))
head(tsne_out_coords)

Load gene info

gene_symbols <- read.table(file.path("../..",
                                     "data",
                                     "misc",
                                     "genes.tsv"),
                           header = FALSE,
                           row.names = 1,
                           stringsAsFactors = FALSE)
gene_symbols <- setNames(gene_symbols[[1]], rownames(gene_symbols))
head(gene_symbols)
## ENSMUSG00000051951 ENSMUSG00000089699 ENSMUSG00000102343 
##             "Xkr4"           "Gm1992"          "Gm37381" 
## ENSMUSG00000025900 ENSMUSG00000109048 ENSMUSG00000025902 
##              "Rp1"              "Rp1"            "Sox17"
# Normalize
expr_readcount_norm <-
    expr_readcount_use[, Matrix::colSums(expr_readcount_use > 0) >= 200]

expr_readcount_norm <-
    expr_readcount_norm[Matrix::rowSums(expr_readcount_norm > 0) >= 30, ]

expr_readcount_norm <-
    expr_readcount_norm[Matrix::rowSums(expr_readcount_norm) >= 60, ]

expr_readcount_norm@x <- median(Matrix::colSums(expr_readcount_norm)) *
    (expr_readcount_norm@x / rep.int(Matrix::colSums(expr_readcount_norm),
                                     diff(expr_readcount_norm@p)))

Visualization of single cell clustering

Prepare cluster labels

cluster_labels <- tsne_out_coords %>%
    group_by(cluster) %>% summarise(x = mean(x),
                                    y = mean(y)) %>%
        as.data.frame

# adjust several label positions
cluster_labels[13,] <- c(13, -2.5, -60)
cluster_labels[19,] <- c(19, 0.4228625, 27)
cluster_labels[20,] <- c(20, -52.5, -38.5)
cluster_labels[21,] <- c(21, -5.273788, 0)

Preview clustering

(p <-
    ggplot(tsne_out_coords, aes(x, y,
                                color = cluster)) +
    geom_point(size = .6, stroke = 0, shape = 16) +
    annotate("text",
             family = "Arial",
             x = cluster_labels[, "x"],
             y = cluster_labels[, "y"], label = cluster_labels[, 1],
             size = 2,
             color = "black") +
    theme_void() +
    guides(color = FALSE))

Violin plot of marker genes

Calculate CPM

expr_cpm_use <- expr_readcount_use

expr_cpm_use@x <- 1000000 *
    (expr_cpm_use@x / rep.int(Matrix::colSums(expr_cpm_use),
                              diff(expr_cpm_use@p)))
dim(expr_cpm_use)
## [1] 27999 27416

Generate violin plots of Tnnt2, Myl2, Sln, Col1a1, Gata6 and Mef2c

# extract colors from the initial plot to keep colors consistent
p_built <- ggplot_build(p)

p_built_colors <- p_built$data[[1]]
rownames(p_built_colors) <- rownames(tsne_out_coords)

cluster_colors <- data.frame(color = p_built_colors["colour"],
                             cluster = tsne_out_coords[rownames(p_built_colors),
                                                       "cluster"])

cluster_colors <- unique(cluster_colors)
rownames(cluster_colors) <- NULL

# order clusters
clusters_selected <- c(19, 18,
                       9, 6, 11, 4, 8, 21,
                       3, 1, 2, 10, 17, 22, 20,
                       13, 7, 14, 12, 15, 5, 16)

# select genes
genes_selected <- c("ENSMUSG00000026414",
                    "ENSMUSG00000013936",
                    "ENSMUSG00000042045",
                    "ENSMUSG00000001506",
                    "ENSMUSG00000005836",
                    "ENSMUSG00000005583")

# prepare data
expr_violins <-
    do.call(rbind.data.frame,
            lapply(clusters_selected, function(x) {

                cells_in_selected_cluster <-
                    rownames(tsne_out_coords[tsne_out_coords$cluster %in% x,])

                expr <-
                    expr_cpm_use[genes_selected,
                                 cells_in_selected_cluster] %>%
                    as.matrix %>%
                    as.data.frame %>%
                    mutate(gene = rownames(.)) %>%
                    gather(key = "cell",
                           value = "expr",
                           - gene)
                expr$cluster <- x
                return(expr)
            }))

# plot
expr_violins %>%
    mutate(symbol = factor(gene_symbols[as.character(gene)],
                           levels = gene_symbols[genes_selected]),
           cluster = factor(cluster,
                            levels = rev(clusters_selected))) %>%
    ggplot(aes(cluster, log10(expr + 1),
               fill = cluster,
               color = cluster)) +
    geom_violin(trim = TRUE, lwd = .3) +
    theme_bw() +
    coord_flip() +
    stat_summary(fun.y = median, geom = "point", color = "black", size = 1) +
    facet_wrap( ~ symbol, nrow = 1) +
    scale_fill_manual(values = rev(cluster_colors[match(clusters_selected,
                                                        cluster_colors$cluster),
                                                  "colour"])) +
    scale_color_manual(values = rev(cluster_colors[match(clusters_selected,
                                                         cluster_colors$cluster),
                                                   "colour"])) +
    scale_y_continuous(name = expression("log"[10]*"(CPM + 1)"),
                       breaks = c(0, 5)) +
    scale_x_discrete(name = "Cluster") +
    theme(panel.grid.minor = element_blank(),
          panel.grid.major = element_blank()) +
    guides(fill = FALSE,
           color = FALSE) +
    theme(axis.text = element_text(family = "Arial", size = 7),
          axis.title = element_text(family = "Arial", size = 8),
          axis.title.y = element_text(margin = margin(t = 0, r = 0,
                                                      b = 0, l = 0,
                                                      unit = "mm")),
          strip.text.x = element_text(family = "Arial", size = 8),
          legend.title = element_text(family = "Arial", size = 6),
          legend.text = element_text(family = "Arial", size = 6),
          plot.title = element_text(family = "Arial",
                                    size = 6,
                                    hjust = .5,
                                    margin = margin(t = 0, r = 0,
                                                    b = 0, l = 0,
                                                    unit = "mm")),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank())

t-SNE embedding

Format plot

# extract colors from the initial plot
p_built <- ggplot_build(p)

p_built_colors <- p_built$data[[1]]
rownames(p_built_colors) <- rownames(tsne_out_coords)

# get cell groups
cells_reprogr <-
    tsne_out_coords %>% mutate(cell = rownames(.)) %>%
    filter(category %in% c("JD168")) %>% .$cell

cells_uninfected <-
    tsne_out_coords %>% mutate(cell = rownames(.)) %>%
    filter(category %in% c("JD174")) %>% .$cell

cells_primary <-
    tsne_out_coords %>% mutate(cell = rownames(.)) %>%
    filter(! category %in% c("JD168", "JD174")) %>% .$cell

# assign colors
color_info <- p_built_colors
color_info[cells_reprogr, "colour"] <- "#FF5722"
color_info[cells_uninfected, "colour"] <- "#FF5722"

cell_colors <- color_info$colour
names(cell_colors) <- rownames(tsne_out_coords)
cell_colors_uniq <- unique(cell_colors)
names(cell_colors_uniq) <- cell_colors_uniq

tsne_out_coords$color <- cell_colors

Generate plot

cell_type_segments <-
    data.frame(x = c(-40, -40, -40, -40, -69, -35, 40, 40, 40, 65, 28, -12, -57.5, -57.5, -57.5),
               y = c(60, 60, 60, 60, 20, -55, -70, -70, -70, 50, 55, 65, -52.5, -52.5, -52.5),
               xend = c(-30, -44, -60, -30, -69, -35, 32, 0, 40, 65, 35, -20, -17.5, -53, -37.5),
               yend = c(35, 15, -5, -22, 35, -65, -58, -66, -45, 36, 60, 70, -25, -45, -47.5),
               cluster = c(4, 6, 9, 11, 8, 12, 7, 13, 14, 16, 5, 15, 2, 20, 12))

cell_type_labels <-
    data.frame(x = c(-40, -61.5, -43, 52, 56, 46, -22, -58),
               y = c(65, 45, -70, -70, 55, 64, 75, -58),
               label = c("Atrial myocytes",
                         "Ventricular\nmyocytes",
                         "Epicardial cells",
                         "Fibroblasts",
                         "Lymphocytes",
                         "Endothelial cells",
                         "Macrophages",
                         "MEF-derived"))

p_tsne_color_primary <-
    ggplot() +
    geom_point(data = tsne_out_coords, aes(x, y,
                                           color = color),
               size = .6, stroke = 0, shape = 16) +
    scale_color_manual(values = cell_colors_uniq) +
    theme_void() +
    annotate("text",
             family = "Arial",
             x = cluster_labels[, "x"],
             y = cluster_labels[, "y"], label = cluster_labels[, 1],
             parse = TRUE,
             size = 2,
             color = c("black")) +
    guides(color = FALSE,
           shape = FALSE) +
    geom_segment(data = cell_type_segments,
                 aes(x = x, xend = xend, y = y, yend = yend),
                 color = "grey50",
                 size = .2) +
    geom_text(data = cell_type_labels,
              aes(x, y, label = label),
              color = c(rep("black", 2), "#00BFC4", rep("black", 4), "#FF5722"),
              size = 2.8,
              family = "Arial")

p_tsne_color_primary

Distribution of different cells across clusters

# prepare data
cell_distribution_primary <-
    tsne_out_coords[cells_primary,] %>%
    group_by(cluster) %>% summarise(primary.count = n())

cell_distribution_reprogr <-
    tsne_out_coords[cells_reprogr,] %>%
    group_by(cluster) %>% summarise(reprogr.count = n())

cell_distribution_uninfected <-
    tsne_out_coords[cells_uninfected,] %>%
    group_by(cluster) %>% summarise(uninfected.count = n())

# order clusters
clusters_selected <- c(19, 18,
                       9, 6, 11, 4, 8, 21,
                       3, 1, 2, 10, 17, 22, 20,
                       13, 7, 14, 12, 15, 5, 16)

# plot
full_join(cell_distribution_primary, cell_distribution_reprogr,
          by = "cluster") %>%
    full_join(cell_distribution_uninfected, by = "cluster") %>%
    select(primary.count : uninfected.count) %>%
    mutate(primary.count = replace_na(primary.count, 0),
           reprogr.count = replace_na(reprogr.count, 0),
           uninfected.count = replace_na(uninfected.count, 0)) %>%
    `/`(rowSums(.)) %>%
    as.data.frame %>%
    gather(key = category,
           value = ratio) %>%
    mutate(cluster = factor(rep(seq_len(length(levels(tsne_out_coords$cluster))),
                                3),
                            levels = rev(clusters_selected))) %>%
    ggplot() +
    geom_bar(aes(x = cluster, y = ratio, fill = category), stat = "identity") +
    theme_light() +
    scale_x_discrete(name = NULL,
                     breaks = seq_len(length(levels(tsne_out_coords$cluster)))) +
    scale_y_continuous(name = "Percentage",
                       breaks = seq(0, 1, .2),
                       labels = scales::percent) +
    coord_flip() +
    theme(panel.grid.major.x = element_blank(),
          panel.grid.major.y = element_blank(),
          legend.position = "right",
          legend.direction ="vertical",
          legend.title = element_blank(),
          axis.text = element_text(family = "Arial", size = 7),
          axis.title = element_text(family = "Arial", size = 8),
          strip.text.x = element_text(family = "Arial", size = 8),
          legend.text = element_text(family = "Arial", size = 7)) +
    scale_fill_manual(name = NULL,
                      values = c("#00AFBB", "#8BC34A", "#E7B800"),
                      labels = c("Primary", "Reprogr", "Uninfected"))

Visualization of Myod1 and Wt1 expressions

Calculate CPM, if it has not been calculated already

if (! exists("expr_cpm_use")) {
    expr_cpm_use <- expr_readcount_use

    expr_cpm_use@x <- 1000000 *
        (expr_cpm_use@x / rep.int(Matrix::colSums(expr_cpm_use),
                                  diff(expr_cpm_use@p)))
}
dim(expr_cpm_use)
## [1] 27999 27416

t-SNE embedding

grep("Myod1", gene_symbols, ignore.case = TRUE, value = TRUE)
## ENSMUSG00000009471 
##            "Myod1"
selected_gene <- "ENSMUSG00000009471"

p_tsne_Myod1 <-
    tsne_out_coords %>%
    mutate(cell = rownames(.),
           expr = log10(expr_cpm_use[selected_gene, cell] + 1)) %>%
    arrange(expr) %>%
    ggplot(aes(x, y,
               color = expr)) +
    geom_point(size = .4, stroke = 0, shape = 16) +
    scale_color_viridis_c(name = NULL) +
    theme_void() +
    theme(legend.justification = c(1, 0),
          # legend.position = c(.28, .75),
          legend.position = c(.92, .08),
          legend.text = element_text(family = "Arial",
                                     size = 4,
                                     margin = margin(t = 0, r = 0,
                                                     b = 0, l = -1.8,
                                                     unit = "mm")),
          legend.key.size = unit(1.5, "mm"),
          legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "mm")) +
    annotate("text",
             x = -65,
             # x = -72,
             y = 70,
             family = "Arial",
             label = "Myod1\n",
             color = "black",
             size = 2.8,
             vjust = "inward", hjust = "inward")

grep("Wt1", gene_symbols, ignore.case = TRUE, value = TRUE)
## ENSMUSG00000052748 ENSMUSG00000074987 ENSMUSG00000016458 
##             "Swt1"            "Wt1os"              "Wt1"
selected_gene <- "ENSMUSG00000016458"

p_tsne_Wt1 <-
    tsne_out_coords %>%
    mutate(cell = rownames(.),
           expr = log10(expr_cpm_use[selected_gene, cell] + 1)) %>%
    arrange(expr) %>%
    ggplot(aes(x, y,
               color = expr)) +
    geom_point(size = .4, stroke = 0, shape = 16) +
    scale_color_viridis_c(name = NULL) +
    theme_void() +
    theme(legend.justification = c(1, 0),
          # legend.position = c(.28, .75),
          legend.position = c(.92, .08),
          legend.text = element_text(family = "Arial",
                                     size = 4,
                                     margin = margin(t = 0, r = 0,
                                                     b = 0, l = -1.8,
                                                     unit = "mm")),
          legend.key.size = unit(1.5, "mm"),
          legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "mm")) +
    annotate("text",
             x = -65,
             # x = -72,
             y = 70,
             family = "Arial",
             label = "Wt1\n",
             color = "black",
             size = 2.8,
             vjust = "inward", hjust = "inward")

p_combined <- grid.arrange(p_tsne_Myod1,
                           p_tsne_Wt1, nrow = 1)

Bar plot

Define function for data preparation

prepare_lollipop_data <- function(genes,
                                  cells,
                                  expr_cpm,
                                  tolerance = 1e-3) {
    # cells: list

    expr_cpm_selected <-
        expr_cpm[genes, unlist(cells)]

    expr_pct <-
        lapply(seq_len(length(cells)), function(x) {

            cells_in_selected_group <- cells[[x]]

            sapply(genes, function(y) {
                1 - mean(mapply(FUN = all.equal,
                                tolerance = tolerance,
                                expr_cpm_selected[y,
                                                  cells_in_selected_group],
                                0) == TRUE)
            })
        })

    expr_avg <-
        lapply(seq_len(length(cells)), function(x) {
            cells_in_selected_group <- cells[[x]]
            Matrix::rowMeans(expr_cpm_selected[, cells_in_selected_group])
        })

    expr_pct <- do.call(cbind.data.frame, expr_pct)
    expr_avg <- do.call(cbind.data.frame, expr_avg)
    colnames(expr_pct) <- seq_len(length(cells))
    colnames(expr_avg) <-seq_len(length(cells))

    expr_pct$gene <- rownames(expr_pct)
    expr_avg$gene <- rownames(expr_avg)

    expr_pct_long <-
        gather(expr_pct,
               key = group,
               value = expr.pct,
               seq_len(ncol(expr_pct) - 1))

    expr_avg_long <-
        gather(expr_avg,
               key = cluster,
               value = expr.avg,
               seq_len(ncol(expr_avg) - 1))

    expr_pct_avg <- cbind(expr_pct_long,
                          expr_avg_long)

    expr_pct_avg[, c(4, 5)] <- NULL

    expr_pct_avg$gene <- factor(expr_pct_avg$gene,
                                levels = genes)

    return(expr_pct_avg)
}

Myod1

# get cell groups
cells_cluster20_reprogr <-
    tsne_out_coords %>% mutate(cell = rownames(.)) %>%
    filter(cluster == 20 & category %in% ("JD168")) %>% .$cell

cells_cluster20_reprogr_other <-
    cells_reprogr[! cells_reprogr %in% cells_cluster20_reprogr]

cells_selected <- list(cells_cluster20_reprogr,
                       cells_cluster20_reprogr_other,
                       cells_uninfected)

# select genes
genes_selected <- c("ENSMUSG00000009471",
                    "ENSMUSG00000031972",
                    "ENSMUSG00000017300")

# summarize expr info
lollipop_data_use <-
    prepare_lollipop_data(genes = genes_selected,
                          cells = cells_selected,
                          expr_cpm = expr_cpm_use)

# plot
lollipop_data_use %>%
    mutate(symbol = gene_symbols[as.character(gene)],
           expr.avg.log10 = log10(expr.avg + 1)) %>%
    filter(symbol %in% c("Myod1")) %>%
    ggplot(aes(group, expr.avg.log10, fill = group)) +
    geom_bar(stat = "identity", position ="dodge") +
    scale_fill_manual(name = NULL,
                      values = c("#FF5722",
                                 "grey35",
                                 "grey35")) +
    facet_wrap(~ symbol) +
    theme_bw() +
    scale_y_continuous(name = expression(paste("Avg expr (log"[10], " cpm)")),
                       breaks = 0:3,
                       labels = scales::math_format(10 ^ .x)) +
    scale_x_discrete(name = NULL,
                     labels = c("Cluster 20",
                                "Other",
                                "Uninfected")) +
    theme(axis.text = element_text(family = "Arial", size = 7),
          axis.title = element_text(family = "Arial", size = 8),
          axis.text.x = element_text(angle = 90,
                                     vjust = .5,
                                     hjust = 1),
          axis.title.y = element_text(margin = margin(t = 0, r = 0,
                                                      b = 0, l = 0,
                                                      unit = "mm")),
          strip.text.x = element_text(family = "Arial", size = 8),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank()) +
    guides(fill = FALSE)

Wt1

# get cell groups
cells_cluster12 <-
    tsne_out_coords %>% mutate(cell = rownames(.)) %>%
    filter(cluster == 12) %>% .$cell

cells_cluster12_primary <-
    tsne_out_coords %>% mutate(cell = rownames(.)) %>%
    filter(cluster == 12 & ! category %in% c("JD168", "JD174")) %>% .$cell

cells_cluster12_reprogr <-
    tsne_out_coords %>% mutate(cell = rownames(.)) %>%
    filter(cluster == 12 & category %in% c("JD168")) %>% .$cell

cells_cluster12_reprogr_other <-
    cells_reprogr[! cells_reprogr %in% cells_cluster12_reprogr]

cells_selected <- list(cells_cluster12_primary,
                       cells_cluster12_reprogr,
                       cells_cluster12_reprogr_other,
                       cells_uninfected)

# select genes
genes_selected <- c("ENSMUSG00000016458",
                    "ENSMUSG00000025105")

# summarize expr info
lollipop_data_use <-
    prepare_lollipop_data(genes = genes_selected,
                          cells = cells_selected,
                          expr_cpm = expr_cpm_use)

# plot
lollipop_data_use %>%
    mutate(symbol = factor(gene_symbols[as.character(gene)],
                           levels = c("Wt1", "Bnc1")),
           expr.avg.log10 = log10(expr.avg + 1)) %>%
    ggplot(aes(group, expr.avg.log10, fill = group)) +
    geom_bar(stat = "identity", position ="dodge") +
    scale_fill_manual(name = NULL,
                      values = c("#00BFC4",
                                 "#FF5722",
                                 "grey35",
                                 "grey35")) +
    facet_wrap(~ symbol) +
    theme_bw() +
    scale_y_continuous(name = expression(paste("Avg expr (log"[10], " cpm)")),
                       breaks = 0:3,
                       labels = scales::math_format(10 ^ .x)) +
    scale_x_discrete(name = NULL,
                     labels = c("Primary epi.",
                                "Cluster 12",
                                "Other",
                                "Uninfected")) +
    theme(axis.text = element_text(family = "Arial", size = 7),
          axis.title = element_text(family = "Arial", size = 8),
          axis.text.x = element_text(angle = 90,
                                     vjust = .5,
                                     hjust = 1),
          axis.title.y = element_text(margin = margin(t = 0, r = 0,
                                                      b = 0, l = 0,
                                                      unit = "mm")),
          strip.text.x = element_text(family = "Arial", size = 8),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank()) +
    guides(fill = FALSE)

Gene ontology term enrichment

Define function

run_binomial_test <- function(expr,
                              expr_cpm = NULL,
                              cell_group_a,
                              cell_group_b = NULL,
                              log2_effect_size_cutoff = 1,
                              positive_frac_cutoff = .2,
                              p.adj_method = "BH") {

    if (is.null(cell_group_b)) {
        cell_group_b <- colnames(expr)[!colnames(expr) %in% cell_group_a]
        cell_group_b_label <- "rest"
        message("cells in group a vs the rest")
    }

    m <- Matrix::rowSums(expr[, cell_group_b] > 0)
    m1 <- m
    m1[m == 0] <- 1

    n <- Matrix::rowSums(expr[, cell_group_a] > 0)

    pv1 <- pbinom(n, length(cell_group_a), m1 / length(cell_group_b),
                  lower.tail = FALSE) + dbinom(n, length(cell_group_a),
                                               m1 / length(cell_group_b))

    expressed_ratio_log_fold <-
        log(n * length(cell_group_b) / (m * length(cell_group_a)), base = 2)

    d1 <- data.frame(log2.effect = expressed_ratio_log_fold,
                     pval = pv1)

    d1 <- d1[d1$log2.effect >= log2_effect_size_cutoff, ]
    d1 <- d1[complete.cases(d1), ]

    n1 <- n
    n1[n == 0] <- 1

    pv2 <-  pbinom(m, length(cell_group_b), n1 / length(cell_group_a),
                   lower.tail = FALSE) + dbinom(m, length(cell_group_b),
                                                n1 / length(cell_group_a))

    d2 <- data.frame(log2.effect = expressed_ratio_log_fold,
                     pval = pv2)

    d2 <- d2[d2$log2.effect <= -log2_effect_size_cutoff, ]
    d2 <- d2[complete.cases(d2), ]

    d <- rbind(d1, d2)

    positive_frac_a <- round(
        Matrix::rowSums(expr[rownames(d),
                             cell_group_a] > 0) / ncol(expr[rownames(d),
                                                            cell_group_a]), 3)

    positive_frac_b <- round(
        Matrix::rowSums(expr[rownames(d),
                             cell_group_b] > 0) / ncol(expr[rownames(d),
                                                            cell_group_b]), 3)

    d$pos.frac.a <- positive_frac_a
    d$pos.frac.b <- positive_frac_b

    transcripts_use <-
        (positive_frac_a >=
             positive_frac_cutoff) | (positive_frac_b >=
                                          positive_frac_cutoff)

    d <- d[transcripts_use, ]
    d <- d[order(abs(d$log2.effect), decreasing = TRUE), ]

    d$norm.reads.mean.a <- Matrix::rowMeans(expr[rownames(d), cell_group_a])
    d$norm.reads.mean.b <- Matrix::rowMeans(expr[rownames(d), cell_group_b])
    d$norm.reads.log2.ratio <-
        log((d$norm.reads.mean.a + .01)/ (d$norm.reads.mean.b + .01),
            base = 2)

    if (! is.null(expr_cpm)) {
        expr_cpm_use <- expr_cpm[rownames(d), ]
        d$cpm.meam.a <- Matrix::rowMeans(expr_cpm_use[, cell_group_a])
        d$cpm.meam.b <- Matrix::rowMeans(expr_cpm_use[, cell_group_b])
        d$cpm.log2.ratio <-
            log((d$cpm.meam.a + .01) / (d$cpm.meam.b + .01), base = 2)
    }

    d$p.adj = p.adjust(d$pval, method = p.adj_method)
    return(d)
}
library(topGO)
packageVersion("topGO")
## [1] '2.34.0'

Enrich GO terms of highly expressed genes in cluster 20

de_paired <- run_binomial_test(expr = expr_readcount_norm,
                               expr_cpm = expr_cpm_use,
                               cell_group_a = cells_cluster20_reprogr,
                               cell_group_b = cells_cluster20_reprogr_other,
                               log2_effect_size_cutoff = 1,
                               positive_frac_cutoff = .2,
                               p.adj_method = "BH")

genes_of_interest <- rownames(subset(de_paired, log2.effect > 0))
gene_universe <- rownames(expr_readcount_norm)
genes_formatted <- factor(as.integer(gene_universe %in% genes_of_interest))
names(genes_formatted) <- gene_universe

topgo_data <- new("topGOdata", ontology = "BP",
                  allGenes = genes_formatted,
                  annot = annFUN.org,
                  mapping = "org.Mm.eg.db",
                  ID = "Ensembl")

topgo_out_classic_fisher <- runTest(topgo_data,
                                    algorithm = "classic",
                                    statistic = "fisher")

num_go_terms <- 15

# prepare data
gentable_use <-
    GenTable(topgo_data,
             classicFisher = topgo_out_classic_fisher,
             topNodes = num_go_terms) %>%
    mutate(classicFisher = as.numeric(str_replace(classicFisher, "< ", "")),
           Term = factor(Term(GO.ID),
                         levels = rev(Term(GO.ID))))

# plot
ggplot(gentable_use, aes(y = -log10(classicFisher), x = Term)) +
    geom_bar(stat = "identity", fill = "grey70") + coord_flip() +
    labs(x = NULL) +
    theme_classic() +
    scale_y_continuous(name = expression(paste("-log"[10]," (p-value)")),
                       breaks = c(0, 30),
                       labels = scales::math_format(10 ^ .x)) +
    annotate("text",
             x = seq_len(nrow(gentable_use)),
             y = rep(1, nrow(gentable_use)),
             label = rev(gentable_use$Term),
             # vjust = "inward",
             hjust = "inward",
             size = 2.5,
             family = "Arial") +
    theme(axis.text = element_text(family = "Arial", size = 7),
          axis.title = element_text(family = "Arial", size = 8),
          legend.title = element_text(family = "Arial", size = 8),
          legend.text = element_text(family = "Arial", size = 8),
          axis.line.y = element_blank(),
          axis.title.y = element_blank(),
          axis.text.y = element_blank(),
          axis.ticks.y = element_blank())

Enrichment of exogenous factors in reprogrammed cells

Define function

analyze_gene_enrichment <- function(expr,
                                    cell_group_a,
                                    cell_group_b,
                                    gene_symbols = NULL,
                                    expr_cpm = NULL) {

    m <- Matrix::rowSums(expr[, cell_group_b] > 0)
    m1 <- m
    m1[m == 0] <- 1

    n <- Matrix::rowSums(expr[, cell_group_a] > 0)

    pv1 <- pbinom(n, length(cell_group_a), m1 / length(cell_group_b),
                  lower.tail = FALSE) + dbinom(n, length(cell_group_a),
                                               m1 / length(cell_group_b))

    expressed_ratio_log_fold <-
        log(n * length(cell_group_b) / (m * length(cell_group_a)), base = 2)

    d1 <- data.frame(log2.effect = expressed_ratio_log_fold,
                     pval = pv1)

    d1$pos.frac.a <- Matrix::rowMeans(expr[, cell_group_a] > 0)
    d1$pos.frac.b <- Matrix::rowMeans(expr[, cell_group_b] > 0)

    if (! is.null(gene_symbols)) {
        d1$symbol <- gene_symbols[rownames(d1)]
    }

    if (! is.null(expr_cpm)) {
        d1$mean.cpm.a <- Matrix::rowMeans(expr_cpm[rownames(d1), cell_group_a])
        d1$mean.cpm.b <- Matrix::rowMeans(expr_cpm[rownames(d1), cell_group_b])
    }

    d1$p.adj <- p.adjust(d1$pval, method = "BH")
    return(d1)
}
# select reprogrammed cell clusters
clusters_selected_reprogr <- c(1, 2, 3, 10, 12, 17, 20, 22)

genes_selected_43 <- c("ENSMUSG00000023067",
                       "ENSMUSG00000086369",
                       "ENSMUSG00000042002",
                       "ENSMUSG00000021944",
                       "ENSMUSG00000015627",
                       "ENSMUSG00000005836",
                       "ENSMUSG00000037335",
                       "ENSMUSG00000038193",
                       "ENSMUSG00000028800",
                       "ENSMUSG00000019777",
                       "ENSMUSG00000026313",
                       "ENSMUSG00000040289",
                       "ENSMUSG00000019789",
                       "ENSMUSG00000042258",
                       "ENSMUSG00000063568",
                       "ENSMUSG00000024063",
                       "ENSMUSG00000030557",
                       "ENSMUSG00000079033",
                       "ENSMUSG00000005583",
                       "ENSMUSG00000001419",
                       "ENSMUSG00000020160",
                       "ENSMUSG00000030544",
                       "ENSMUSG00000025930",
                       "ENSMUSG00000048450",
                       "ENSMUSG00000021469",
                       "ENSMUSG00000020542",
                       "ENSMUSG00000009471",
                       "ENSMUSG00000015579",
                       "ENSMUSG00000026923",
                       "ENSMUSG00000030551",
                       "ENSMUSG00000026565",
                       "ENSMUSG00000009739",
                       "ENSMUSG00000001288",
                       "ENSMUSG00000015846",
                       "ENSMUSG00000027833",
                       "ENSMUSG00000024515",
                       "ENSMUSG00000028949",
                       "ENSMUSG00000032419",
                       "ENSMUSG00000000093",
                       "ENSMUSG00000018604",
                       "ENSMUSG00000018263",
                       "ENSMUSG00000020167",
                       "ENSMUSG00000025860")

summary_enrichment_genes_selected <-
    do.call(rbind.data.frame,
            lapply(clusters_selected_reprogr, function(x) {
                cat("Computing cluster", x, "\n")

                cells_in_selected_group <-
                    rownames(tsne_out_coords)[tsne_out_coords$cluster == x &
                        tsne_out_coords$category == "JD168"]
                cells_in_selected_group_other <-
                    cells_reprogr[! cells_reprogr %in% cells_in_selected_group]

                y <- analyze_gene_enrichment(expr = expr_readcount_norm,
                                             cell_group_a = cells_in_selected_group,
                                             cell_group_b = cells_in_selected_group_other,
                                             gene_symbols = gene_symbols[rownames(expr_readcount_norm)],
                                             expr_cpm_use)[genes_selected_43, ]

                y$category <- x
                return(y)
            }))
## Computing cluster 1 
## Computing cluster 2 
## Computing cluster 3 
## Computing cluster 10 
## Computing cluster 12 
## Computing cluster 17 
## Computing cluster 20 
## Computing cluster 22
summary_enrichment_genes_selected_use <-
    summary_enrichment_genes_selected %>%
        filter(category %in% c(20, 12, 2)) %>%
        mutate(category = factor(category,
                                 levels = c(20, 12, 2)))

facet_labels <- paste("Cluster", levels(summary_enrichment_genes_selected_use$category))
names(facet_labels) <- levels(summary_enrichment_genes_selected_use$category)

# plot
ggplot() +
    geom_abline(intercept = 0, slope = 1, linetype = 2) +
    geom_point(data = summary_enrichment_genes_selected_use,
               aes(pos.frac.b,
                   pos.frac.a,
                   size = -log10(p.adj),
                   color = log2.effect),
                   alpha = .8,
                   stroke = 0, shape = 16) +
    facet_wrap(~ category,
               nrow = 1,
               labeller = labeller(category = facet_labels)) +
    coord_fixed() +
    scale_x_continuous(name = "Expr (%, other reprogrammed cells)",
                       limits = c(0, 1), breaks = seq(0, 1, .2)) +
    scale_y_continuous(name = "Expr (%, indicated cluster)",
                       limits = c(0, 1), breaks = seq(0, 1, .2)) +
    scale_color_viridis_c(name = expression(paste("Log"[2], " effect"))) +
    theme_bw() +
    labs(size = expression(paste("-log"[10], " (p-value)"))) +
    theme(axis.text = element_text(family = "Arial", size = 7),
          axis.title = element_text(family = "Arial", size = 8),
          strip.text.x = element_text(family = "Arial", size = 8),
          legend.title = element_text(family = "Arial", size = 8),
          legend.text = element_text(family = "Arial", size = 6),
          legend.position = "bottom",
          legend.box = "horizontal",
          legend.key.size = unit(2.5, "mm"),
          legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "mm"),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank()) +
    guides(color = guide_colorbar(order = 1),
           size = guide_legend(order = 2)) +
    geom_text_repel(data = subset(summary_enrichment_genes_selected_use,
                                  symbol %in% c("Myod1", "Mef2b",
                                                "Mef2c", "Hand2", "Gata6")),
                    aes(pos.frac.b,
                        pos.frac.a,
                        label = subset(summary_enrichment_genes_selected_use,
                                       symbol %in% c("Myod1", "Mef2b",
                                                     "Mef2c", "Hand2",
                                                     "Gata6"))$symbol),
                    size = 2.5,
                    family = "Arial",
                    box.padding = .2,
                    point.padding = .2,
                    nudge_y = .15,
                    arrow = arrow(length = unit(.02, 'npc')),
                    segment.color = "grey35",
                    color = "black")

Identification of candidate factors

genes_selected_10 <- c("ENSMUSG00000026628",
                       "ENSMUSG00000016458",
                       "ENSMUSG00000025105",
                       "ENSMUSG00000051910",
                       "ENSMUSG00000045680",
                       "ENSMUSG00000038193",
                       "ENSMUSG00000031965",
                       "ENSMUSG00000032419",
                       "ENSMUSG00000005836",
                       "ENSMUSG00000036098")
gene_symbols[genes_selected_10]
## ENSMUSG00000026628 ENSMUSG00000016458 ENSMUSG00000025105 
##             "Atf3"              "Wt1"             "Bnc1" 
## ENSMUSG00000051910 ENSMUSG00000045680 ENSMUSG00000038193 
##             "Sox6"            "Tcf21"            "Hand2" 
## ENSMUSG00000031965 ENSMUSG00000032419 ENSMUSG00000005836 
##            "Tbx20"            "Tbx18"            "Gata6" 
## ENSMUSG00000036098 
##             "Myrf"
clusters_selected <- levels(tsne_out_coords$cluster)[
    ! levels(tsne_out_coords$cluster) %in% c(1, 2, 3, 10, 17, 20, 22)]

cells_selected_lollipop <-
    lapply(clusters_selected, function(x) {
        rownames(tsne_out_coords)[tsne_out_coords$cluster == x
                                  & (! tsne_out_coords$category %in%
                                         c("JD168", "JD174"))]
    })
names(cells_selected_lollipop) <- clusters_selected
cells_selected_lollipop$uninfected <- cells_uninfected

expr_pct_avg_lollipop <-
    prepare_lollipop_data(genes = genes_selected_10,
                          cells = cells_selected_lollipop,
                          expr_cpm = expr_cpm_use,
                          tolerance = 1e-3)

# set y axis label colors
colors_axis_labels_y <- unique(p_built_colors$colour)
colors_axis_labels_y <- setNames(colors_axis_labels_y, unique(p_built_colors$group))
colors_axis_labels_y <- c(colors_axis_labels_y, uninfected = "grey50")

# set y axis labels
axis_labels_y <- c("Epicardial cell",
                   "Uninfected MEF",
                   "CM 2",
                   "CM 1",
                   "Atrial CM",
                   "Atrial CM",
                   "Atrial CM",
                   "Atrial CM",
                   "Ventricular CM",
                   "CM 3",
                   "Cardiac fibroblast",
                   "Cardiac fibroblast",
                   "Cardiac fibroblast",
                   "Macrophage",
                   "Endothelial cell",
                   "Lymphocyte")

# order y axis
axis_y <- c("12",
            "uninfected",
            "19",
            "18",
            "9",
            "6",
            "11",
            "4",
            "8",
            "21",
            "13",
            "7",
            "14",
            "15",
            "5",
            "16")

expr_pct_avg_lollipop %>%
    mutate(group = factor(names(cells_selected_lollipop)[as.integer(group)],
                          levels = rev(axis_y)),
           symbol = gene_symbols[as.character(gene)]) %>%
    ggplot(aes(symbol,
               group,
               size = expr.pct,
               color = log(expr.avg + 1, 10))) +
    geom_point() +
    guides(color = guide_colorbar(order = 1),
           size = guide_legend(order = 2)) +
    scale_color_viridis_c(name = expression(paste("Avg expr (log"[10], " cpm)")),
                          limits = c(0, 3)) +
    theme_light() +
    scale_x_discrete(name = NULL, position = "top") +
    scale_y_discrete(name = NULL, label = rev(axis_labels_y)) +
    scale_size(name = "% cells expressing gene",
               limits = c(0, 1),
               breaks = seq(0, 1, .2),
               range = c(0, 4)) +
    theme(axis.text = element_text(family = "Arial", size = 8),
          axis.title = element_text(family = "Arial", size = 6),
          axis.text.x = element_text(angle = - 45,
                                     vjust = .5,
                                     hjust = 1),
          axis.text.y = element_text(color = colors_axis_labels_y[rev(axis_y)]),
          legend.title = element_text(family = "Arial", size = 8),
          legend.text = element_text(family = "Arial", size = 8),
          legend.position = "bottom",
          legend.box = "vertical",
          legend.key.size = unit(2.5, "mm"),
          legend.margin = margin(t = -2, r = 15, b = 0, l = -5, unit = "mm"))

Information about the current R session

sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin18.2.0 (64-bit)
## Running under: macOS Mojave 10.14.3
## 
## Matrix products: default
## BLAS/LAPACK: /usr/local/Cellar/openblas/0.3.5/lib/libopenblas_haswellp-r0.3.5.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] org.Mm.eg.db_3.7.0   topGO_2.34.0         SparseM_1.77        
##  [4] GO.db_3.7.0          AnnotationDbi_1.44.0 IRanges_2.16.0      
##  [7] S4Vectors_0.20.1     Biobase_2.42.0       graph_1.60.0        
## [10] BiocGenerics_0.28.0  ggrepel_0.8.0        extrafont_0.17      
## [13] gridExtra_2.3        scales_1.0.0.9000    forcats_0.4.0.9000  
## [16] stringr_1.4.0.9000   dplyr_0.8.0.9006     purrr_0.3.1         
## [19] readr_1.3.1.9000     tidyr_0.8.3.9000     tibble_2.0.1.9001   
## [22] ggplot2_3.1.0.9000   tidyverse_1.2.1.9000 Matrix_1.2-15       
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.0.9000      bit64_0.9-7          jsonlite_1.6        
##  [4] viridisLite_0.3.0    modelr_0.1.4         assertthat_0.2.0    
##  [7] blob_1.1.1           cellranger_1.1.0     yaml_2.2.0          
## [10] Rttf2pt1_1.3.7       pillar_1.3.1.9000    RSQLite_2.1.1       
## [13] backports_1.1.3      lattice_0.20-38      glue_1.3.0.9000     
## [16] extrafontdb_1.0      digest_0.6.18        rvest_0.3.2         
## [19] colorspace_1.4-0     htmltools_0.3.6      pkgconfig_2.0.2     
## [22] broom_0.5.1.9000     haven_2.1.0          generics_0.0.2      
## [25] withr_2.1.2          lazyeval_0.2.1       cli_1.0.1           
## [28] magrittr_1.5.0.9000  crayon_1.3.4         readxl_1.3.0.9000   
## [31] memoise_1.1.0        evaluate_0.13        fs_1.2.6.9000       
## [34] xml2_1.2.0           tools_3.5.2          hms_0.4.2.9001      
## [37] matrixStats_0.54.0   munsell_0.5.0        reprex_0.2.1.9000   
## [40] compiler_3.5.2       rlang_0.3.1.9000     grid_3.5.2          
## [43] rstudioapi_0.9.0     labeling_0.3         rmarkdown_1.11      
## [46] gtable_0.2.0         DBI_1.0.0            R6_2.4.0            
## [49] lubridate_1.7.4.9000 knitr_1.21           bit_1.1-14          
## [52] zeallot_0.1.0        stringi_1.3.1        Rcpp_1.0.0          
## [55] vctrs_0.1.0.9002     dbplyr_1.3.0         tidyselect_0.2.5    
## [58] xfun_0.5